library(ggplot2)
library(tidyverse)
library(readr)
library(readxl)
library(rlang)
library(htmlwidgets)
library(plotly)
# NOTE(review): hard-coded, machine-specific working directory; the relative
# data path used below resolves against it.
setwd("~/Library/CloudStorage/Box-Box/FALL2021/ENV170/FINALPROJECT")

# Load the "Master" sheet: first column as text, the other four as numeric.
MASTER_DATASET <- read_excel(
  path = "DATASETS/MASTER/MASTER_DATASET.xlsx",
  sheet = "Master",
  col_types = c("text", rep("numeric", 4))
)
MASTER_DATASET
## # A tibble: 19 × 5
## ISO Terawatt_Hours Carbon_Emissions Trading_Volume GINI_2020
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 ARE 345. 20.8 47.8 42.4
## 2 AUS 248. 15.5 54.8 34.7
## 3 BRA 553. 2.04 25.2 53.1
## 4 CAN 572. 15.5 65.6 33.3
## 5 CHN 6880. 7.41 198. NA
## 6 DEU 559. 8.56 NA 32.4
## 7 ESP 256. 5.52 NA 34.8
## 8 EU 1601. 24.1 204. NA
## 9 FRA 474. 4.62 NA 32.4
## 10 IDN 275. 2.18 8.84 38.3
## 11 IND 1309. 1.80 63.7 35.9
## 12 ITA 312. 5.38 NA 36.2
## 13 JPN 955. 8.74 6.66 32.9
## 14 KOR 563. 12.2 NA NA
## 15 MEX 285 3.74 23.5 45.1
## 16 RUS 997. 11.1 421. 36.6
## 17 TUR 272 5.02 5 42.2
## 18 USA 4194. 15.2 1524. 42.0
## 19 ZAF 229. 7.50 9.92 NA
We use the Shapiro-Wilk test to check each variable for normality. If the p-value is greater than alpha = 0.05, we cannot reject the hypothesis that the data are normally distributed.
# Shapiro-Wilk normality test on each study variable. The null hypothesis is
# that the data are normally distributed, so p < 0.05 rejects normality.
shapiro.test(MASTER_DATASET$Terawatt_Hours) # p-value = 1.561e-06 -> normality rejected
##
## Shapiro-Wilk normality test
##
## data: MASTER_DATASET$Terawatt_Hours
## W = 0.55371, p-value = 1.561e-06
shapiro.test(MASTER_DATASET$Carbon_Emissions) # p-value = 0.08199 -> normality NOT rejected at alpha = 0.05
##
## Shapiro-Wilk normality test
##
## data: MASTER_DATASET$Carbon_Emissions
## W = 0.91242, p-value = 0.08199
shapiro.test(MASTER_DATASET$Trading_Volume) # p-value = 6.233e-06 -> normality rejected
##
## Shapiro-Wilk normality test
##
## data: MASTER_DATASET$Trading_Volume
## W = 0.50281, p-value = 6.233e-06
# Conclusion: Terawatt_Hours and Trading_Volume are not normally distributed.
# Carbon_Emissions (p = 0.08199) does not reject normality at alpha = 0.05.
# Spearman rank correlation of each predictor with Carbon_Emissions.
# Trading_Volume contains NAs (see the printed table), so fewer than 19 pairs
# enter the first test.
cor.test(x = MASTER_DATASET$Trading_Volume, y = MASTER_DATASET$Carbon_Emissions, method = "spearman") # rho = 0.4461538, p = 0.1119
##
## Spearman's rank correlation rho
##
## data: MASTER_DATASET$Trading_Volume and MASTER_DATASET$Carbon_Emissions
## S = 252, p-value = 0.1119
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.4461538
cor.test(x = MASTER_DATASET$Terawatt_Hours, y = MASTER_DATASET$Carbon_Emissions, method = "spearman") # rho = 0.245614, p = 0.3094
##
## Spearman's rank correlation rho
##
## data: MASTER_DATASET$Terawatt_Hours and MASTER_DATASET$Carbon_Emissions
## S = 860, p-value = 0.3094
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.245614
# Conclusion: neither correlation is statistically significant at alpha = 0.05
# (p = 0.1119 and p = 0.3094); with at most 19 observations the data are limited.
# Histograms of the three study variables, each with a dashed line at the mean.
# na.rm = TRUE matters here: Trading_Volume contains NAs, so a plain mean()
# evaluates to NA and ggplot2 drops the reference line (with a warning).
ggplot(MASTER_DATASET, aes(Terawatt_Hours)) +
  geom_histogram(bins = 10, color = "black", fill = "mediumvioletred") +
  geom_vline(
    aes(xintercept = mean(Terawatt_Hours, na.rm = TRUE)),
    color = "skyblue3", linetype = "dashed", size = 1
  ) +
  labs(x = "Terawatt per Hour Consumption")

ggplot(MASTER_DATASET, aes(Carbon_Emissions)) +
  geom_histogram(bins = 10, color = "black", fill = "mediumseagreen") +
  geom_vline(
    aes(xintercept = mean(Carbon_Emissions, na.rm = TRUE)),
    color = "skyblue3", linetype = "dashed", size = 1
  ) +
  labs(x = "Carbon Emissions Data")

ggplot(MASTER_DATASET, aes(Trading_Volume)) +
  geom_histogram(bins = 10, color = "black", fill = "dodgerblue") +
  geom_vline(
    aes(xintercept = mean(Trading_Volume, na.rm = TRUE)),
    color = "skyblue3", linetype = "dashed", size = 1
  ) +
  labs(x = "Trading Volume of BTC")
First, we plot CO2 emissions vs. electricity consumption in terawatt-hours
# Scatter plot of CO2 emissions against electricity consumption, one point
# per row of MASTER_DATASET.
HK <- ggplot(
  data = MASTER_DATASET,
  mapping = aes(x = Terawatt_Hours, y = Carbon_Emissions, label = ISO)
) +
  geom_point(colour = "mediumvioletred", size = 2.5) +
  labs(
    x = "Electricity Consumption in Terawatt Hours in 2019",
    y = "CO2 emissions (metric tons per capita) in 2018"
  )

# Add each row's ISO code as a small text label offset from its point.
HKP <- HK +
  geom_text(
    size = 2.1,
    hjust = 0.80,
    vjust = 2,
    nudge_x = 0.05,
    check_overlap = FALSE
  )
HKP
# HKP is the version used in the final project
Now, we plot Bitcoin trading volume vs. electricity consumption in terawatt-hours
# Scatter plot of electricity consumption against Bitcoin trading volume, with
# dashed reference lines at the mean of each axis variable.
# na.rm = TRUE is required: Trading_Volume contains NAs, so without it the
# mean is NA and the vertical reference line is dropped from the plot.
BB <-
  ggplot(MASTER_DATASET, aes(x = Trading_Volume, y = Terawatt_Hours, label = (ISO))) +
  geom_point() +
  labs(
    x = "Bitcoin Trading Volume on Online Exchanges by Country 2020. (in millions of U.S. dollars)",
    y = "Electricity Consumption in Terawatt Hours in 2019"
  ) +
  geom_vline(
    aes(xintercept = mean(Trading_Volume, na.rm = TRUE)),
    color = "blue", linetype = "dashed", size = 1
  ) +
  geom_hline(
    aes(yintercept = mean(Terawatt_Hours, na.rm = TRUE)),
    color = "blue", linetype = "dashed", size = 1
  )

# Add each row's ISO code as a small text label offset from its point.
BBP <- BB + geom_text(size = 2.1, hjust = 0.80, vjust = 2, nudge_x = 0.05, check_overlap = FALSE)
BBP
Now, we plot CO2 Emissions v. BTC trading volume
# Scatter plot of CO2 emissions against Bitcoin trading volume.
# The y-axis label previously read "C02" (digit zero); it now reads "CO2",
# matching the label used on the electricity-consumption scatter plot.
GOW <-
  ggplot(MASTER_DATASET, aes(x = Trading_Volume, y = Carbon_Emissions, label = (ISO))) +
  geom_point(colour = "dodgerblue", size = 2.5) +
  labs(
    x = "Bitcoin Trading Volume on Online Exchanges by Country 2020. (in millions of U.S. dollars)",
    y = "CO2 emissions (metric tons per capita) in 2018"
  )

# check_overlap = TRUE hides labels that would be drawn over an earlier label.
GOWP <- GOW + geom_text(size = 2.1, hjust = 0.80, vjust = 2, nudge_x = 0.05, check_overlap = TRUE)
GOWP
# GOWP is the version used in the final project
# Read the cleaned CO2 emissions table (one row per country or region, one
# column per year, judging by the printed header) and show a preview.
CO2_4 <- readr::read_csv(file = "DATASETS/CO2_Emissions/CO24CLEAN.csv")
CO2_4
## # A tibble: 266 × 63
## ...1 `Country Name` `1960` `1961` `1962` `1963` `1964` `1965`
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 Aruba 205. 209. 226. 215. 208. 185.
## 2 2 Africa Eastern an… 0.906 0.922 0.931 0.941 0.996 1.05
## 3 3 Afghanistan 0.0461 0.0536 0.0737 0.0742 0.0862 0.101
## 4 4 Africa Western an… 0.0909 0.0953 0.0966 0.112 0.133 0.185
## 5 5 Angola 0.101 0.0822 0.211 0.203 0.214 0.206
## 6 6 Albania 1.26 1.37 1.44 1.18 1.11 1.17
## 7 7 Andorra NA NA NA NA NA NA
## 8 8 Arab World 0.609 0.663 0.727 0.853 0.972 1.14
## 9 9 United Arab Emira… 0.119 0.109 0.164 0.176 0.133 0.147
## 10 10 Argentina 2.38 2.46 2.54 2.33 2.55 2.66
## # … with 256 more rows, and 55 more variables: 1966 <dbl>, 1967 <dbl>,
## # 1968 <dbl>, 1969 <dbl>, 1970 <dbl>, 1971 <dbl>, 1972 <dbl>, 1973 <dbl>,
## # 1974 <dbl>, 1975 <dbl>, 1976 <dbl>, 1977 <dbl>, 1978 <dbl>, 1979 <dbl>,
## # 1980 <dbl>, 1981 <dbl>, 1982 <dbl>, 1983 <dbl>, 1984 <dbl>, 1985 <dbl>,
## # 1986 <dbl>, 1987 <dbl>, 1988 <dbl>, 1989 <dbl>, 1990 <dbl>, 1991 <dbl>,
## # 1992 <dbl>, 1993 <dbl>, 1994 <dbl>, 1995 <dbl>, 1996 <dbl>, 1997 <dbl>,
## # 1998 <dbl>, 1999 <dbl>, 2000 <dbl>, 2001 <dbl>, 2002 <dbl>, 2003 <dbl>, …
Extract data from 2008 to 2018 to cover the timeframe Bitcoin has been in circulation
# Drop columns 2 through 50 by position. Judging by the printed header these
# are `Country Name` and the years 1960-2007 (TODO confirm the year columns
# are contiguous).
CO2_5 <- CO2_4 %>% select(-(2:50))
# Drop positions 13-14 of what remains -- presumably 2019 and 2020, since the
# series printed afterwards runs 2008-2018.
CO2_52 <- CO2_5 %>% select(-(13:14))
Now, get only data for the United States from 2008 to 2018
# Pull out a single row by position (US row is 252).
newdf <- slice(CO2_52, 252)
# Transpose so each original column becomes a row, then convert back to a
# data frame.
newdf2 <- as.data.frame(t(newdf))
# Remove the first transposed row, which comes from the first column of newdf
# (the `...1` index column in the printed header).
newdf3 <- slice(newdf2, -1)
names(newdf3)[1] <- "CO2 Emissions"
newdf3
## CO2 Emissions
## 2008 18.29481
## 2009 16.81887
## 2010 17.43420
## 2011 16.59940
## 2012 15.77114
## 2013 16.10296
## 2014 16.02638
## 2015 15.53534
## 2016 15.13175
## 2017 14.80588
## 2018 15.24087
# Move the year row names into an explicit `Years` column
Years <- rownames(newdf3)
d <- newdf3
rownames(d) <- NULL # reset row names to the default 1..n index
newdf4 <- cbind(Years, d)
newdf4 # inspect the combined table
## Years CO2 Emissions
## 1 2008 18.29481
## 2 2009 16.81887
## 3 2010 17.43420
## 4 2011 16.59940
## 5 2012 15.77114
## 6 2013 16.10296
## 7 2014 16.02638
## 8 2015 15.53534
## 9 2016 15.13175
## 10 2017 14.80588
## 11 2018 15.24087
# Pin the column types: years as character, emissions as numeric.
newdf4[["Years"]] <- as.character(newdf4[["Years"]])
newdf4[["CO2 Emissions"]] <- as.numeric(newdf4[["CO2 Emissions"]])
newdf4
## Years CO2 Emissions
## 1 2008 18.29481
## 2 2009 16.81887
## 3 2010 17.43420
## 4 2011 16.59940
## 5 2012 15.77114
## 6 2013 16.10296
## 7 2014 16.02638
## 8 2015 15.53534
## 9 2016 15.13175
## 10 2017 14.80588
## 11 2018 15.24087
Plot CO2 emissions in the US for the years 2008 to 2018
# Quick look: CO2 emissions by year, points only
ggplot(newdf4, aes(x = Years, y = `CO2 Emissions`)) +
  geom_point()

# Same points joined by a line. group = 1 places all rows in one group so the
# line connects across the character-valued x axis.
p <- ggplot(newdf4, aes(x = Years, y = `CO2 Emissions`)) +
  geom_point() +
  geom_line(aes(group = 1)) +
  theme(axis.text.x = element_text(angle = 0, vjust = 0.5))
p
Now, add layers to this plot
# Final styling of the US emissions plot, followed by an interactive export.
# - theme_bw() is a complete theme and replaces any theme() settings added
#   before it, so it is applied first and the theme() tweak second.
# - The y-axis label states the per-capita unit: the plotted values match the
#   "metric tons per capita" series used in the cross-country scatter plots,
#   not a figure in millions.
# - The plot is built once rather than constructed twice.
p2 <- p +
  xlab("Years") +
  ylab("CO2 emissions (metric tons per capita)") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
p2

# Convert to an interactive plotly widget once, display it, then save it as
# an HTML file in the working directory.
USCarbonEmissions <- plotly::ggplotly(p2)
USCarbonEmissions
htmlwidgets::saveWidget(USCarbonEmissions, "USCarbonEmissions.html")
Import yearly data on BTC in circulation for the years 2009-2018.
# Load the "BTC" sheet of the Bitcoin-in-circulation workbook.
BTC <- read_excel(
  path = "~/Library/CloudStorage/Box-Box/FALL2021/ENV170/FINALPROJECT/DATASETS/BTC/BTCincirculation.xlsx",
  sheet = "BTC"
)
# Rename `Year` to `Years` and store it as character, the same name and type
# as the `Years` column of the CO2 table.
names(BTC)[names(BTC) == "Year"] <- "Years"
BTC$Years <- as.character(BTC$Years)
Plot BTC data over time
# Line-and-point plot of the BTC column by year. group = 1 places all rows in
# one group so the line connects across the character-valued x axis.
p3 <- ggplot(data = BTC) +
  geom_line(aes(x = Years, y = BTC, group = 1)) +
  geom_point(aes(x = Years, y = BTC))
p3

# Styled version. theme_bw() is a complete theme and replaces any theme()
# settings added before it, so it is applied first and the theme() tweak second.
p4golden <- p3 +
  xlab("Years") +
  ylab("Millions of Bitcoins in existence") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
p4golden

# Use plotly::ggplotly() to make an interactive plot. It is converted once,
# displayed, then saved as an HTML file in the working directory.
BitcoinsCirculation <- plotly::ggplotly(p4golden)
BitcoinsCirculation
htmlwidgets::saveWidget(BitcoinsCirculation, "BitcoinsCirculation.html")
Merge Datasets. R will merge by “Years” by default
# Combine the BTC series with the CO2 series, keeping every row of BTC.
# The join key is spelled out so the merge cannot silently change if the two
# tables ever gain another column name in common; it also removes the
# "Joining, by = ..." message.
us_btc_co2 <- left_join(BTC, newdf4, by = "Years")
# NOTE(review): write.csv() writes row names as an unnamed first column by
# default; pass row.names = FALSE if that index column is not wanted.
write.csv(us_btc_co2, "US BTC and CO2 Emissions.csv")
why? I was most pleased with my correlation visualizations between CO2 emissions against electricity consumption and BTC trading volume because they were by far the most complex out of any other visualization I worked on. I decided to label each data point by country ISO (3 letter code word) because it was easier for the reader to know where world economies sit on my measures.
I was also very proud of creating my plotly charts because they required me to use an outside repository so I could include them in my presentation. It made my charts more interactive and user-friendly.
I didn’t have any problem accessing data for this project because Bitcoin data is widely accessible from countries who use it the most. However, I would have liked to include more data from different countries to have better correlation results and to show where other world economies are with regards to my measures. I found it easier to access information from countries with easier access to information (i.e. the United States and many other European countries)
I envisioned having a side-by-side comparison of how much energy cryptocurrencies consume vis-a-vis cash, using Bitcoin and the USD respectively. However, this would have required a much more in-depth analysis of two major currencies.
I ultimately decided to stay on the issue of cryptocurrency mining so as to not deviate from my main point: large world economies are the agents responsible for greenhouse gas emissions.
I would have liked to draw comparisons between how much energy a single unit of BTC costs to the environment vis-a-vis the US dollar. Making this side-by-side comparison was at the heart of my final project since the beginning, but due to time constraints I could not pursue this narrative.
I had been working with my work supervisor from the Fletcher School’s Institute for Business in the Global Context to delve into this topic since the beginning of the semester. Having worked as a research assistant at this office, I had a better idea as to how I wanted to tackle this project. While I received my work supervisor’s feedback on my analysis, I would have liked to explore my data a bit further.
Yes! Learning R was a big learning curve for me and I feel much more comfortable with the statistical and visualization aspects of my R repertoire. This project was a big step in my coding and statistical knowledge and my results are reflective of how much care and thought I put into my work. Despite not finding statistically significant correlations with my data, I gained a better insight in the field of data analysis and visualizations.
I am happy with my final project and I know I will come back to this as the foundation that got me started with my R knowledge and understanding of statistics.
Please do!
When I found myself in a challenging situation in class, I relied a lot on my housemates for a better understanding of the material. When I was not able to make it to office hours, they offered help in the best way possible.
Forming a study group goes a long way in really getting on top of class. I often found myself helping my “next door neighbor” sitting next to me, and that made a big difference in how much better they could follow along with the course material. The same goes for when I needed help.
I realized asking for help could solve my issues in a much shorter time than it would have taken me if I wanted to figure things out all by myself.